# Merges homes in ZTRAX and Damage Assessment files based on Assessor Parcel Numbers.
pacman::p_load(stringdist, data.table, dplyr, fst)

## Preliminaries
rm(list = ls())

source("code/globals.R")

### -------------- Table of all data files to load and merge

allfiles <- data.frame(character(), character()) %>%
  rbind(c("dins13", "attr13")) %>%
  rbind(c("dins15", "attr15")) %>%
  rbind(c("dins17", "attr16")) %>% ## this one is tricky - i use 17 for damage and 16 for attri..
  rbind(c("dins18", "attr18")) %>%
  rbind(c("damage03_SanDiego", "attr03_SanDiego")) %>%
  rbind(c("damage07_SanDiego", "attr07_SanDiego")) %>%
  rbind(c("damage_ca_07", "attr_ca_07")) %>%
  rbind(c("damage_ca_08", "attr_ca_08")) %>%
  rbind(c("damage_ca_09", "attr_ca_09")) %>%
  rbind(c("damage_ca_12", "attr_ca_12")) %>%
  rbind(c("damage_az_13", "attr_az_13")) %>%
  rbind(c("damage_az_17", "attr_az_16")) %>%
  rbind(c("damage_co_12", "attr_co_12")) %>%
  rbind(c("damage_co_13", "attr_co_13")) %>%
  rbind(c("damage_co_16", "attr_co_16")) %>%
  rbind(c("damage_co_20", "attr_co_20")) %>%
  rbind(c("damage_or_20", "attr_or_20")) %>%
  rbind(c("damage_wa_14", "attr_wa_14")) %>%
  rbind(c("damage_wa_15", "attr_wa_15")) %>%
  rbind(c("damage_wa_20", "attr_wa_20")) %>%
  rename(damage = 1, attribute = 2)

#--------------Function to load cleaned damage files, while also confirming unique ids & adding a var used to track merge rates (inDINS)
loadcleandamage <- function(fname) {
  y <- read_fst(file.path(WORKING, paste0(fname, "_cleaned.fst")), as.data.table = T)
  stopifnot(anyDuplicated(y[, c("FIPS", "UnformattedAssessorParcelNumber", "INCIDENTNUM")]) == 0)
  y$inDINS <- 1
  return(y)
}

#------------------------ Function to load cleaned ZTRAX assessment files
loadcleanattr <- function(fname) {
  y <- read_fst(file.path(WORKING, paste0(fname, "_cleaned.fst")), as.data.table = T)
  stopifnot(anyDuplicated(y[, c("ImportParcelID", "BuildingOrImprovementNumber")]) == 0)
  return(y)
}

## --------------------------- Confirm merge field validity.
# Need to make sure that FIPS and APN uniquely identify parcels in ZTRAX for the subset of homes that appear in DINS #
# This is meant to be handled in the APN cleaning code, but double check here to be safe.
test_m_validity <- function(dinsfile, attrfile) {
  stopifnot(anyDuplicated(dinsfile, by = c("FIPS", "UnformattedAssessorParcelNumber", "INCIDENTNUM")) == 0)
  dinsfile <- dinsfile[!duplicated(dinsfile[, c("FIPS", "UnformattedAssessorParcelNumber")])] ## - remove any multiple occurrences in dins caused by same parcel in more than 1 fire
  d <- merge(attrfile, dinsfile[, .(FIPS, UnformattedAssessorParcelNumber)],
    by = c("FIPS", "UnformattedAssessorParcelNumber")
  )
  x <- nrow(d)
  y <- round(nrow(d) / nrow(dinsfile), 2) # share of DINS records that appear in ZTRAX
  w <- anyDuplicated(d[, .(FIPS, UnformattedAssessorParcelNumber)])
  r <- data.frame(valid = w == 0, parcels = x, shareinZTRAX = y)
  return(r)
}

#- The function used to merge records for a single incident
mergeOneIncident <- function(incidentnum, dinsfile, attrfile) {
  print(incidentnum)
  f <- dinsfile[dinsfile$INCIDENTNUM == incidentnum, ]
  affectedcounties <- unique(f$FIPS)
  z <- merge(
    attrfile[attrfile$FIPS %in% affectedcounties, c(
      "ImportParcelID", "BuildingOrImprovementNumber", "PropertyFullStreetAddress",
      "PropertyZip", "FIPS", "UnformattedAssessorParcelNumber", "PropertyLandUseStndCode", "YearBuilt",
      "PropertyAddressLatitude", "PropertyAddressLongitude", "PropertyStreetName", "PropertyStreetSuffix",
      "PropertyStreetPreDirectional", "PropertyStreetPostDirectional", "PropertyHouseNumber",
      "EffectiveYearBuilt", "YearRemodeled",
      "LotSizeAcres", "TotalBedrooms",
      "TotalCalculatedBathCount", "ImprovementAssessedValue", "TotalAssessedValue", "AssessmentYear",
      "RoofCoverStndCode", "sqfeet"
    )],
    f[, c(
      "FIPS", "UnformattedAssessorParcelNumber", "DAMAGE", "STRUCTURETYPE", "inDINS",
      "badrecordflag", "howmanystructures", "badstructuretype",
      "ROOFCONSTRUCTION", "EAVES", "VENTSCREEN", "EXTERIORSIDING", "WINDOWPANE",
      "DECKPORCHONGRADE", "DECKPORCHELEVATED", "PATIOCOVERCARPORT",
      "FENCEATTACHEDTOSTRUCTURE", "APN_orig", "address_orig", "lon_orig", "lat_orig", "in_damage_data"
    )],
    by = c("FIPS", "UnformattedAssessorParcelNumber"), all.x = TRUE
  )
  z$inDINS[is.na(z$inDINS)] <- 0
  z$incidentid <- incidentnum
  incname <- unique(f$INCIDENTNAME)
  stopifnot(length(incname) == 1)
  z$incidentname <- incname
  incstartdate <- unique(f$date)
  stopifnot(length(incstartdate) == 1)
  z$incidentstartdate <- incstartdate
  ## Some cleanup on attribute vars
  z$BuildingOrImprovementNumber <- as.numeric(z$BuildingOrImprovementNumber)
  z$YearBuilt <- as.numeric(z$YearBuilt)
  z$EffectiveYearBuilt <- as.numeric(z$EffectiveYearBuilt)
  z$YearRemodeled <- as.numeric(z$YearRemodeled)
  z$TotalBedrooms <- as.numeric(z$TotalBedrooms)
  z$TotalCalculatedBathCount <- as.numeric(z$TotalCalculatedBathCount)
  z$ImprovementAssessedValue <- as.numeric(z$ImprovementAssessedValue)
  z$TotalAssessedValue <- as.numeric(z$TotalAssessedValue)
  z$AssessmentYear <- as.numeric(z$AssessmentYear)
  z$sqfeet <- as.numeric(z$sqfeet)
  ## Report merge rate for this incident (share of DINS records mathced to ZTRAX)
  mergerate <- round(nrow(z[z$inDINS == 1, ]) / nrow(f), digits = 2)
  z$IncidentMergeRate <- mergerate
  affectedhomes <- nrow(f)
  z$IncidentAffectedHomes <- affectedhomes
  print(paste0(
    incidentnum, " Match Rate ", mergerate,
    " or ", nrow(z[z$inDINS == 1, ]), " out of ", nrow(f)
  ))
  ## Optional: Subset to zip codes with affected homes (first makes sure DESTROYED and the zip variable are not missing)
  ### Caveat: This shrinks data to a manageable size for geocoding, but at the cost that I probably lose some homes that would otherwise be in the sample as "not destroyed". The
  # reason is that there are about 210,000 homes that have missing Zip codes in ZTRAX (out of 7 M ZTRAX homes, so 3%).
  # On the other hand, a very large share of these homes seem to have no information for street address or year built, which would make them of little use anyway as controls.
  # A disproportionate share are classified as RR111, "Timeshare".
  treatedzipcodes <- unique(z$PropertyZip[z$inDINS == 1])
  z <- z[z$PropertyZip %in% treatedzipcodes & !is.na(z$PropertyZip), ]
  return(z)
}

#--- function to merge one state-year group:
runmerge <- function(filenum) {
  # filenum = 12 ## Debug
  print(allfiles$damage[filenum])
  ### Load the damage and attribute files
  damagefile <- loadcleandamage(allfiles$damage[filenum])
  attrfile <- loadcleanattr(allfiles$attribute[filenum])
  ### Confirm validity of merge fields
  print(test_m_validity(damagefile, attrfile))
  stopifnot(test_m_validity(damagefile, attrfile)[, 1] == TRUE)
  ### Do the merge by incident
  incidents <- unique(damagefile$INCIDENTNUM)
  list <- lapply(incidents, mergeOneIncident, dinsfile = damagefile, attrfile = attrfile)
  merged <- do.call(rbind, list)
  return(merged)
}

all <- lapply(1:nrow(allfiles), runmerge)
all <- do.call(rbind, all)

# OPtional: incident-specific inclusion cutoff based on merge rates
nrow(all[all$IncidentMergeRate >= 0.9])

## -- harmonize damage coding across DINS and non-DINS data sources
table(all$DAMAGE, useNA = "always")
all$DAMAGE[all$inDINS == 0] <- "Not in DINS data"
all$DAMAGE[all$DAMAGE == "Destroyed (>50%)"] <- "Destroyed"
all$DAMAGE[all$DAMAGE %in% c("Affected (1-9%)", "Major (26-50%)", "Minor (10-25%)")] <- "Minor"
all$DAMAGE[all$DAMAGE == "NoDamage"] <- "No Damage"
all$DAMAGE[all$DAMAGE == "Inaccessible"] <- "Unknown"
table(all$DAMAGE, useNA = "always")

### Make a main outcome variable and set equal to "not destroyed" if not in DINS. Note this is not 100% always true - some DINS homes don't merge to ZTRAX and so are lost.
all$destroyed <- all$DAMAGE == "Destroyed"
table(all$destroyed, useNA = "always")

### Make an updated year built var that reflects remodels
all$combinedyear <- all$YearBuilt
all$incyear <- year(all$incidentstartdate)
#- Use effective year built if it is newer than year built, and less than year of incident.
all$combinedyear[all$EffectiveYearBuilt > all$YearBuilt &
  all$EffectiveYearBuilt < all$incyear &
  !is.na(all$EffectiveYearBuilt) &
  !is.na(all$incyear) &
  !is.na(all$YearBuilt)] <-
  all$EffectiveYearBuilt[all$EffectiveYearBuilt > all$YearBuilt &
    all$EffectiveYearBuilt < all$incyear &
    !is.na(all$EffectiveYearBuilt) &
    !is.na(all$incyear) &
    !is.na(all$YearBuilt)]
colSums(is.na(all))


### Do some binning of years
all$yearbin <- cut(all$combinedyear,
  breaks = c(0, seq(1900, 2020, 3)),
  include.lowest = TRUE, right = FALSE, labels = FALSE
)
all$yearbin <- 1900 + 3 * (all$yearbin - 2)
all$yearbin[all$yearbin < 1945 & !is.na(all$yearbin)] <- 1945

### Create a 'state' column
all$temp_string_fips <- as.character(all$FIPS)
all$temp_string_fips[nchar(all$temp_string_fips) == 4] <-
  paste0("0", all$temp_string_fips[nchar(all$temp_string_fips) == 4])
all$State[substr(all$temp_string_fips, 1, 2) == "04"] <- "Arizona"
all$State[substr(all$temp_string_fips, 1, 2) == "06"] <- "California"
all$State[substr(all$temp_string_fips, 1, 2) == "08"] <- "Colorado"
all$State[substr(all$temp_string_fips, 1, 2) == "41"] <- "Oregon"
all$State[substr(all$temp_string_fips, 1, 2) == "53"] <- "Washington"
all$temp_string_fips <- NULL
table(all$State, useNA = "always")
stopifnot(anyNA(all$State) == 0)

######################################
#### Export file for geocoding #######
######################################
### I do this before applying various exclusions below, to make sure we have locations for all.

GeocodeData <- all[!duplicated(all[, .(ImportParcelID, BuildingOrImprovementNumber)]), ]

#### Add county name field to guide geocoding ###############
FIPS <- numeric()
cabbr <- character()
countyname <- character()
FIPS_names <- data.frame(FIPS, countyname, stringsAsFactors = FALSE)

## California county names
FIPS_names <- FIPS_names %>%
  rbind(c(6001, "Alameda")) %>%
  rbind(c(6005, "Amador")) %>%
  rbind(c(6007, "Butte")) %>%
  rbind(c(6009, "Calaveras")) %>%
  rbind(c(6013, "Contra Costa")) %>%
  rbind(c(6017, "El Dorado")) %>%
  rbind(c(6019, "Fresno")) %>%
  rbind(c(6021, "Glenn")) %>%
  rbind(c(6023, "Humboldt")) %>%
  rbind(c(6027, "Inyo")) %>%
  rbind(c(6029, "Kern")) %>%
  rbind(c(6031, "Kings")) %>%
  rbind(c(6033, "Lake")) %>%
  rbind(c(6035, "Lassen")) %>%
  rbind(c(6037, "Los Angeles")) %>%
  rbind(c(6039, "Madera")) %>%
  rbind(c(6043, "Mariposa")) %>%
  rbind(c(6045, "Mendocino")) %>%
  rbind(c(6051, "Mono")) %>%
  rbind(c(6053, "Monterey")) %>%
  rbind(c(6055, "Napa")) %>%
  rbind(c(6057, "Nevada")) %>%
  rbind(c(6059, "Orange")) %>%
  rbind(c(6061, "Placer")) %>%
  rbind(c(6063, "Plumas")) %>%
  rbind(c(6065, "Riverside")) %>%
  rbind(c(6067, "Sacramento")) %>%
  rbind(c(6069, "San Benito")) %>%
  rbind(c(6071, "San Bernardino")) %>%
  rbind(c(6073, "San Diego")) %>%
  rbind(c(6077, "San Joaquin")) %>%
  rbind(c(6079, "San Luis Obispo")) %>%
  rbind(c(6081, "San Mateo")) %>%
  rbind(c(6083, "Santa Barbara")) %>%
  rbind(c(6085, "Santa Clara")) %>%
  rbind(c(6087, "Santa Cruz")) %>%
  rbind(c(6089, "Shasta")) %>%
  rbind(c(6093, "Siskiyou")) %>%
  rbind(c(6095, "Solano")) %>%
  rbind(c(6097, "Sonoma")) %>%
  rbind(c(6099, "Stanislaus")) %>%
  rbind(c(6103, "Tehama")) %>%
  rbind(c(6105, "Trinity")) %>%
  rbind(c(6107, "Tulare")) %>%
  rbind(c(6109, "Tuolumne")) %>%
  rbind(c(6111, "Ventura")) %>%
  rbind(c(6113, "Yolo")) %>%
  rbind(c(6115, "Yuba"))

## Other states county names
FIPS_names <- FIPS_names %>%
  rbind(c(4025, "Yavapai")) %>%
  rbind(c(8041, "El Paso")) %>%
  rbind(c(8049, "Grand")) %>%
  rbind(c(8069, "Larimer")) %>%
  rbind(c(41029, "Jackson")) %>%
  rbind(c(41039, "Lane")) %>%
  rbind(c(41041, "Lincoln")) %>%
  rbind(c(41047, "Marion")) %>%
  rbind(c(53047, "Okanogan"))
names(FIPS_names) <- c("FIPS", "countyname")
FIPS_names$FIPS <- as.numeric(FIPS_names$FIPS)

GeocodeData <- merge(GeocodeData, FIPS_names, by = "FIPS", all.x = TRUE)
table(GeocodeData$countyname, useNA = "always")

stopifnot(anyNA(GeocodeData$State) == 0)
stopifnot(anyNA(GeocodeData$countyname) == 0)
write_fst(GeocodeData, file.path(WORKING, "data_to_geocode.fst"))

## Exporting a CSV for import into ArcGIS: ArcGIS Pro needs a flat file to load these addresses in order to do the address-based geocoding with StreetMap Premium.
## I have found unstable/undesirable behavior when writing a .csv on the cluster machines, even after trying to adjust the 'eol' option in write.csv().
## For safety, the workflow for geocoding involves reading the RDS on a Windows PC, writing a csv from that machine, and uploading to ArcGIS.
## See also this strange bug in ArcGIS to be aware of: https://community.esri.com/t5/arcgis-pro-questions/arcgis-pro-ignores-changes-to-csv-data/td-p/509154
# write.csv(GeocodeData,file.path(WORKING,"data_to_geocode.csv")) ## (turned off on cluster; see note directly above)

#####################################################
#### Additional sample exclusions and cleanup #######
#####################################################

### Drop homes built after the incident (or in the year of)
all <- all[all$combinedyear < all$incyear | is.na(all$combinedyear), ]
all$incyear <- NULL

### Drop some incidents where no homes were destroyed (only DINS records are for "Minor Damage", etc).
table(all$incidentid, useNA = "always")
beforedropping <- unique(all$incidentname)
all[, totdestroyed := sum(destroyed), by = incidentid]
all <- all[totdestroyed != 0, ]
table(all$incidentid, useNA = "always")
afterdropping <- unique(all$incidentname)
setdiff(beforedropping, afterdropping) # The list of fires that get dropped here.

####### Define a single family category based on land use type code
table(all$PropertyLandUseStndCode[all$inDINS == TRUE], useNA = "always")
singlefamcodes <- c(
  "RR101", # SINGLE FAMILY RESIDENTIAL
  "RR102", # RURAL RESIDENCE
  "RR104", # TOWNHOUSE
  "RR105", # CLUSTER HOME
  "RR106", # CONDOMINIUM
  "RR109", # PLANNED UNIT DEVELOPMENT
  "RR999"
)
all[, single_family := PropertyLandUseStndCode %in% singlefamcodes]

####### Drop everything except for single family homes.
# This is an important restriction - removes many mobile homes, condos, and other property types
sum(all$single_family == TRUE) / nrow(all) # Among this sample (zip codes with any homes affected), 76.8% of homes are single family
all <- all[all$single_family == TRUE, ]

####### Create a "street" variable for use in street fixed effects specifications, and for identifying neighbors in spatial analysis.
all$street <- paste(all$PropertyStreetPreDirectional,
  all$PropertyStreetName,
  all$PropertyStreetSuffix,
  all$PropertyStreetPostDirectional,
  all$PropertyZip,
  sep = "|"
)




## Handle properties with a "1/2" in the house number - replace with ".5" so they sort numerically
all$ModifiedNum <- all$PropertyHouseNumber
length(grep(".5", all$ModifiedNum, fixed = TRUE))
length(grep(" 1/2", all$ModifiedNum, fixed = TRUE))
all$ModifiedNum <- gsub(" 1/2", ".5", all$ModifiedNum, fixed = TRUE)

## Similarly, there are a tiny number of properties with "1/4" or "3/4"
all$ModifiedNum <- gsub(" 1/4", ".25", all$ModifiedNum, fixed = TRUE)
all$ModifiedNum <- gsub(" 3/4", ".75", all$ModifiedNum, fixed = TRUE)

# grep(" ",all$ModifiedNum,fixed=TRUE,value=FALSE)
housenumlist <- strsplit(all$ModifiedNum, split = "\\s+")
all$housenum <- sapply(housenumlist, "[", 1) # https://stackoverflow.com/questions/31235165/sapply-with-strsplit-in-r
nonnumericchars <- c(letters, LETTERS, "#", "/", "-", ",", "(", ")")
for (x in nonnumericchars) {
  all$housenum <- gsub(x, "", all$housenum, fixed = TRUE)
}
### stopifnot(sum(is.na(all$housenum))==sum(is.na(as.numeric(all$housenum)))) #check that there are no remaining non-numeric characters
stopifnot(sum(is.na(all$housenum)) == sum(is.na(as.numeric(all$housenum))) - 1) # Allow exactly 1 strange record (from the Shockey Fire)
all$housenum <- as.numeric(all$housenum)


# /*Calculate order on street for all homes*/
set.seed(389021)
all$tiebreaker <- runif(nrow(all))
all <- all[order(street, housenum, tiebreaker), orderonstreet := 1:.N, by = street] # https://stackoverflow.com/questions/37305422/rank-values-in-r-datatable-grouped-by-another-variable
all$streetchunk100 <- floor((all$orderonstreet - 1) / 100)
all$street100 <- as.factor(paste0(all$street, "ch", all$streetchunk100))
all$streetchunk25 <- floor((all$orderonstreet - 1) / 25)
all$street25 <- as.factor(paste0(all$street, "ch", all$streetchunk25))
all[, c("streetchunk100", "streetchunk25", "tiebreaker")] <- NULL

## Odd and even house numbers (side of street in most places)
all$even_house_num <- all$housenum %% 2 == 0
all$street25_byside <- as.factor(paste0(all$street25, "_", all$even_house_num))
all$street100_byside <- as.factor(paste0(all$street100, "_", all$even_house_num))

## Look at distribution of share of homes on street chunk destroyed (for understanding; comment out in final build)
check <- all[, sum(destroyed), by = street25]
table(check$V1, useNA = "always")
rm(check)

### Save merged dataset for analysis
FullRegDataDuplicatedControls <- all
summary(FullRegDataDuplicatedControls)
table(FullRegDataDuplicatedControls$incidentid, useNA = "always")
saveRDS(FullRegDataDuplicatedControls, file = file.path(WORKING, "FullRegDataDuplicatedControls.RDS"))

# Save a table of incident IDs and names for use in spatial analysis to FRAP perimeters
fire_ids <- FullRegDataDuplicatedControls[,
  .(AllHomes = .N, DestroyedHomes = sum(destroyed), StartDate = first(incidentstartdate)),
  by = list(incidentname, incidentid, State)
]
stopifnot(anyDuplicated(fire_ids$incidentid) == 0)
fire_ids <- fire_ids[order(fire_ids$DestroyedHomes, fire_ids$AllHomes), ]
saveRDS(fire_ids, file.path(WORKING, "fire_ids.RDS"))
